import os
import re
import json
import pandas as pd

def parse_noise_from_filename(filename: str):
    """
    Parse noise value from filename.
    Expected format: truth_proportion_summary_AN_noiseDDD.json
    Example: truth_proportion_summary_AN_noise005.json -> 0.05
    """
    m = re.search(r"truth_proportion_summary_AN_noise(\d+)\.json$", filename)
    if m:
        return int(m.group(1)) / 100.0
    return None

def json_to_tables(main_json_file: str, root_dir: str, output_file: str):
    """
    main_json_file: Path to main JSON (generates Sheet1 and Sheet2)
    root_dir: Root directory to recursively search for JSON files containing "truth_proportion_summary_AN_noise"
    output_file: Output xlsx path
    """
    # ---------- Process main_json_file -> Sheet1 (subdirs) & Sheet2 (global) ----------
    if not os.path.isfile(main_json_file):
        raise FileNotFoundError(f"main_json_file not found: {main_json_file}")

    with open(main_json_file, "r", encoding="utf-8") as f:
        main_data = json.load(f)

    # Sheet1: subdirs (only unique_ratio, nontrivial_ratio)
    rows_subdirs = []
    for inout_key, inout_val in main_data.items():
        for subdir_key, subdir_val in inout_val.get("subdirs", {}).items():
            noise0 = subdir_val.get("noise_proportions", {}).get("noise_0", {})
            rows_subdirs.append({
                "inout": inout_key,
                "subdir": subdir_key,
                "unique_ratio": noise0.get("unique_ratio"),
                "nontrivial_ratio": noise0.get("nontrivial_ratio"),
            })

    df_subdirs = pd.DataFrame(rows_subdirs)
    # Sort by unique_ratio in descending order (missing values will be placed last)
    if not df_subdirs.empty:
        df_subdirs = df_subdirs.sort_values(by="unique_ratio", ascending=False)

    # Sheet2: global-level from main JSON
    rows_global = []
    for inout_key, inout_val in main_data.items():
        size_noise0 = inout_val.get("size_level_stats", {}).get("noise_0", {})
        rows_global.append({
            "inout": inout_key,
            "avg_unique_across_subdirs": inout_val.get("avg_unique_across_subdirs"),
            "avg_full_across_subdirs": inout_val.get("avg_full_across_subdirs"),
            "global_unique_ratio": inout_val.get("global_unique_ratio"),
            "global_nontrivial_ratio": inout_val.get("global_nontrivial_ratio"),
            "total_vectors": size_noise0.get("total_vectors"),
            "total_files": size_noise0.get("total_files"),
            "unique_patterns": size_noise0.get("unique_patterns"),
            "max_repeat_count": size_noise0.get("max_repeat_count"),
            "max_repeat_ratio": size_noise0.get("max_repeat_ratio"),
            "avg_repeat": size_noise0.get("avg_repeat"),
        })
    df_global = pd.DataFrame(rows_global)

    # ---------- Traverse root_dir to collect all truth_proportion_summary_AN_noise*.json files ----------
    rows_noise_subdirs = []     # for Sheet3
    rows_noise_global = []      # for Sheet4 (global per file)
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for fname in filenames:
            if not fname.endswith(".json"):
                continue
            if "truth_proportion_summary_AN_noise" not in fname:
                continue
            noise_val = parse_noise_from_filename(fname)
            if noise_val is None:
                # Skip if parsing fails
                continue

            fullpath = os.path.join(dirpath, fname)
            try:
                with open(fullpath, "r", encoding="utf-8") as f:
                    jdata = json.load(f)
            except Exception as e:
                print(f"Warning: failed to load {fullpath}: {e}")
                continue

            # Subdirectory level -> Sheet3 rows
            for inout_key, inout_val in jdata.items():
                for subdir_key, subdir_val in inout_val.get("subdirs", {}).items():
                    noise0 = subdir_val.get("noise_proportions", {}).get("noise_0", {})
                    rows_noise_subdirs.append({
                        "noise": noise_val,
                        "inout": inout_key,
                        "subdir": subdir_key,
                        "unique_ratio": noise0.get("unique_ratio"),
                        "nontrivial_ratio": noise0.get("nontrivial_ratio"),
                    })

                # Also collect global-level metrics for this inout in the file -> for Sheet4
                size_noise0 = inout_val.get("size_level_stats", {}).get("noise_0", {})
                rows_noise_global.append({
                    "noise": noise_val,
                    "inout": inout_key,
                    "avg_unique_across_subdirs": inout_val.get("avg_unique_across_subdirs"),
                    "avg_full_across_subdirs": inout_val.get("avg_full_across_subdirs"),
                    "global_unique_ratio": inout_val.get("global_unique_ratio"),
                    "global_nontrivial_ratio": inout_val.get("global_nontrivial_ratio"),
                    "total_vectors": size_noise0.get("total_vectors"),
                    "total_files": size_noise0.get("total_files"),
                    "unique_patterns": size_noise0.get("unique_patterns"),
                    "max_repeat_count": size_noise0.get("max_repeat_count"),
                    "max_repeat_ratio": size_noise0.get("max_repeat_ratio"),
                    "avg_repeat": size_noise0.get("avg_repeat"),
                })

    df_noise_subdirs = pd.DataFrame(rows_noise_subdirs)
    df_noise_global = pd.DataFrame(rows_noise_global)

    # Sheet4: Aggregate by (noise, inout) (take mean), columns consistent with Sheet2 (excluding subdir)
    if not df_noise_global.empty:
        df_global_by_noise = (
            df_noise_global
            .groupby(["noise", "inout"], as_index=False)
            .mean(numeric_only=True)
        )
    else:
        df_global_by_noise = pd.DataFrame(columns=[
            "noise", "inout",
            "avg_unique_across_subdirs", "avg_full_across_subdirs",
            "global_unique_ratio", "global_nontrivial_ratio",
            "total_vectors", "total_files", "unique_patterns",
            "max_repeat_count", "max_repeat_ratio", "avg_repeat"
        ])

    # ---------- Save to Excel (multiple sheets) ----------
    with pd.ExcelWriter(output_file) as writer:
        # Sheet1: subdirs (from main json) - only unique_ratio/nontrivial_ratio + inout/subdir
        df_subdirs.to_excel(writer, sheet_name="subdirs", index=False)

        # Sheet2: global (from main json)
        df_global.to_excel(writer, sheet_name="global", index=False)

        # Sheet3: merged subdirs across noise files (with noise column)
        if not df_noise_subdirs.empty:
            df_noise_subdirs.to_excel(writer, sheet_name="subdirs_with_noise", index=False)
        else:
            # Write empty headers to keep the sheet
            pd.DataFrame(columns=["noise", "inout", "subdir", "unique_ratio", "nontrivial_ratio"])\
              .to_excel(writer, sheet_name="subdirs_with_noise", index=False)

        # Sheet4: global metrics aggregated by (noise, inout)
        df_global_by_noise.to_excel(writer, sheet_name="global_by_noise", index=False)

    print(f"Saved summary to: {output_file}")


if __name__ == "__main__":
    # Example usage - modify with your paths
    main_json_file = "results/truth_proportion_summary_ANO.json"  # For Sheet1 and Sheet2
    root_dir = ""  # Recursively search for files containing truth_proportion_summary_AN_noise
    output_file = "results/summary_ANO.xlsx"

    json_to_tables(main_json_file, root_dir, output_file)